{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "kernelspec": {
      "display_name": "Python 3",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.6.9"
    },
    "colab": {
      "name": "00_check-data.ipynb",
      "provenance": [],
      "collapsed_sections": []
    }
  },
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "UB6qenKgVyIG",
        "colab_type": "text"
      },
      "source": [
        "Copyright 2019 Google LLC\n",
        "\n",
        "Licensed under the Apache License, Version 2.0 (the \"License\"); you may not use this file except in compliance with the License. You may obtain a copy of the License at\n",
        "\n",
        "https://www.apache.org/licenses/LICENSE-2.0\n",
        "Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an \"AS IS\" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License."
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "CpPQDKFaVyII",
        "colab_type": "text"
      },
      "source": [
        "Make sure that jupyter is installed by running below command (it will allow to create folders in user dir):\n",
        "\n",
        "```shell\n",
        "pip install jupyter --user\n",
        "```"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "dGYS0NKeVyIJ",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "!git clone https://github.com/google-research/google-research.git"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "LutDR7qoVyIQ",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "import sys\n",
        "import os\n",
        "import tarfile\n",
        "import urllib\n",
        "sys.path.append('./google-research')"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "t8e_WfgaVyIV",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"-1\"  # no need to use gpu"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "2vxfxt-VVyIa",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "import tensorflow as tf\n",
        "import tensorflow.compat.v1 as tf1\n",
        "import logging\n",
        "from kws_streaming.models import model_flags\n",
        "from kws_streaming.models import models\n",
        "from kws_streaming.layers.modes import Modes\n",
        "from kws_streaming.train import test\n",
        "from kws_streaming.models import utils\n",
        "from kws_streaming.data import input_data\n",
        "from kws_streaming.data import input_data_utils as du\n",
        "from kws_streaming.models import model_params"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "jIkBmOt6VyIe",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "tf1.__version__"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "sSETc8rdVyIl",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "config = tf1.ConfigProto()\n",
        "config.gpu_options.allow_growth = True\n",
        "sess = tf1.Session(config=config)"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "dnvm8YMlVyIq",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "tf1.disable_eager_execution()"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "yjvGL2IgVyIw",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "DATA_VERSION = 2"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "smFll0s2VyI0",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "current_dir = os.getcwd()\n",
        "\n",
        "if DATA_VERSION == 2:\n",
        "  DATA_URL = \"https://storage.googleapis.com/download.tensorflow.org/data/speech_commands_v0.02.tar.gz\"\n",
        "  DATA_PATH = os.path.join(current_dir, \"data2/\")\n",
        "elif DATA_VERSION == 1:\n",
        "  DATA_URL = \"http://download.tensorflow.org/data/speech_commands_v0.01.tar.gz\"\n",
        "  DATA_PATH = os.path.join(current_dir, \"data1/\")\n",
        "else:\n",
        "  assert(False)"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "2Ap7kbKPVyI4",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "DATA_PATH"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "HCFAzvyfVyI8",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "# create folder in current dir.\n",
        "# not data will be downloaded in DATA_PATH, feel free to specify your own DATA_PATH\n",
        "os.makedirs(DATA_PATH)"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "1232Ifp3VyI_",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "base_name = os.path.basename(DATA_URL)\n",
        "base_name"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "Kf5YF4SsVyJD",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "# it can take some time to download 2.3GB. After unpacking total size is 5.4GB\n",
        "arch_file_name = os.path.join(DATA_PATH, base_name)\n",
        "if not os.path.isfile(arch_file_name):\n",
        "  # download data\n",
        "  if sys.version_info >= (2, 5):\n",
        "    file_path = urllib.request.urlretrieve(DATA_URL, filename=arch_file_name)[0]\n",
        "  else:\n",
        "    file_path = urllib.urlretrieve(DATA_URL, filename=arch_file_name)[0]\n",
        "\n",
        "  # unpack it\n",
        "  file_name, file_extension = os.path.splitext(base_name)\n",
        "  tar = tarfile.open(file_path)\n",
        "  tar.extractall(DATA_PATH)\n",
        "  tar.close()"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "T8z0-SnfVyJG",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "# default parameters for data splitting\n",
        "flags = model_params.Params()\n",
        "flags.data_dir = DATA_PATH\n",
        "flags.data_url = DATA_URL\n",
        "flags = model_flags.update_flags(flags)"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "TsRktp1mVyJK",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "audio_processor = input_data.AudioProcessor(flags)"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "ZXABxi76VyJO",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "testing_set_size = audio_processor.set_size('testing')\n",
        "print(\"testing_set_size \" + str(testing_set_size))\n",
        "training_set_size = audio_processor.set_size('training')\n",
        "print(\"training_set_size \" + str(training_set_size))\n",
        "validation_set_size = audio_processor.set_size('validation')\n",
        "print(\"validation_set_size \" + str(validation_set_size))"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "kDC0kbR2VyJR",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "# V2\n",
        "# testing_set_size 4890\n",
        "# training_set_size 36923\n",
        "# validation_set_size 4445\n",
        "\n",
        "# V1\n",
        "# testing_set_size 3081\n",
        "# training_set_size 22246\n",
        "# validation_set_size 3093"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "RocIfpAtVyJU",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "# all words used for modeling: we have target words + unknown words (with index 1)\n",
        "audio_processor.word_to_index"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "iea72Q5oVyJY",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "# find the start of the file name where label begins\n",
        "string = audio_processor.data_index[\"validation\"][0]['file']\n",
        "res = [i for i in range(len(string)) if string.startswith('/', i)] \n",
        "start_file = res[-2]+1\n",
        "start_file"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "DQWh0HF0VyJb",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "audio_processor.data_index[\"validation\"][0]['file'][start_file:]"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "LdihI451VyJe",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "index_to_label = {}\n",
        "unknown_category = []\n",
        "# labeles used for training\n",
        "for word in audio_processor.word_to_index.keys():\n",
        "  if audio_processor.word_to_index[word] == du.SILENCE_INDEX:\n",
        "    index_to_label[audio_processor.word_to_index[word]] = du.SILENCE_LABEL\n",
        "  elif audio_processor.word_to_index[word] == du.UNKNOWN_WORD_INDEX:\n",
        "    index_to_label[audio_processor.word_to_index[word]] = du.UNKNOWN_WORD_LABEL\n",
        "    unknown_category.append(word)\n",
        "  else:\n",
        "    index_to_label[audio_processor.word_to_index[word]] = word\n",
        "\n",
        "# training labels\n",
        "index_to_label"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "vsgt_OpnVyJh",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "# words belonging to unknown categry\n",
        "unknown_category"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "JDCSwc9SVyJk",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "def get_distribution(mode):\n",
        "  distrib_label = {}\n",
        "  distrib_words = {}\n",
        "  files = {}\n",
        "  for data in audio_processor.data_index[mode]:\n",
        "    word = data['label']\n",
        "    file = data['file'][start_file:]\n",
        "    index = audio_processor.word_to_index[word]\n",
        "    label = index_to_label[index]\n",
        "    if word in files:\n",
        "      files[word].append(file)\n",
        "    else:\n",
        "      files[word] = [file]\n",
        "\n",
        "    if label in distrib_label:\n",
        "      distrib_label[label] = distrib_label[label] + 1\n",
        "    else:\n",
        "      distrib_label[label] = 1\n",
        "\n",
        "    if word in distrib_words:\n",
        "      distrib_words[word] = distrib_words[word] + 1\n",
        "    else:\n",
        "      distrib_words[word] = 1\n",
        "  return distrib_words, distrib_label, files"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "fwrCIcC0VyJn",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "# distribution of labeles in testing data\n",
        "distrib_words_testing, distrib_labels_testing, files_testing = get_distribution('testing')\n",
        "distrib_labels_testing"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "qG0cJ1ifVyJr",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "# distribution of labeles in training data\n",
        "distrib_words_training, distrib_labels_training, files_training = get_distribution('training')\n",
        "distrib_labels_training"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "GRsumUc0VyJv",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "def parse_files(set_list_fname, label='yes'):\n",
        "  set_files = []\n",
        "  with open(set_list_fname) as f:\n",
        "    while True:\n",
        "      line = f.readline()\n",
        "      if not line:\n",
        "        break\n",
        "      if line.split('/')[0] == label:\n",
        "        set_files.append(line[:-1])\n",
        "  return set_files"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "iHw_eD2rVyJy",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "def validate(my_list1, list2, print_in_list2=False):\n",
        "  cnt_my_val2 = 0\n",
        "  cnt_my_val1 = 0\n",
        "  for my_val in my_list1:\n",
        "    if my_val in list2:\n",
        "      cnt_my_val2 = cnt_my_val2 + 1\n",
        "      if print_in_list2:\n",
        "        print(my_val)\n",
        "    else:\n",
        "      cnt_my_val1 = cnt_my_val1 + 1\n",
        "      if not print_in_list2:\n",
        "        print(my_val)\n",
        "  return cnt_my_val1, cnt_my_val2"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "CSbC4wmmVyJ2",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "file_list = os.path.join(DATA_PATH, \"testing_list.txt\")\n",
        "\n",
        "# validate that all wav used during testing belongs to testing_list\n",
        "for word in files_testing.keys():\n",
        "  if word != '_silence_':\n",
        "    word_files = parse_files(file_list, label=word)\n",
        "    _, cnt_val = validate(files_testing[word], word_files, False)\n",
        "    assert(cnt_val-len(files_testing[word])==0)"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "2Q8zKJWwVyJ4",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "\n",
        "distrib_words_training, distrib_labels_training, files_training = get_distribution('training')\n",
        "\n",
        "# validate that all wav used during testing do not belong to training data\n",
        "for word in files_testing.keys():\n",
        "  if word != '_silence_': # silence file does not matter becasue it is multiplied by zero\n",
        "    word_files = files_testing[word]\n",
        "    _, cnt_val = validate(files_training[word], word_files, True)\n",
        "    assert(cnt_val == 0)"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "nQqB8IqLVyJ7",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        ""
      ],
      "execution_count": null,
      "outputs": []
    }
  ]
}
